#------------------------------------------------------------------------------
# clean NVDRS data 
#==============================================================================

# Load the raw NVDRS_PI_2011 file from the raw-data folder as a data.table
nvdrs <- read_fst(
  file.path(rawdata_path, "NVDRS", "NVDRS_PI_2011.fst"),
  as.data.table = TRUE
)

# Give the key columns their short working names in a single pass
# (old name on top, new name directly underneath)
setnames(
  nvdrs,
  old = c('INCYR_C', 'YISITEID', 'YIINCID', 'YIPERID', 'RSTAT_C', 'RCNTY_C'),
  new = c('Year',    'Site',     'IncID',   'PerID',   'State',   'Cnty')
)
    
# Manner of death: recode MANNER_C into the numeric death type DthTypN.
#   "01" -> 1 suicide            "03", "22" -> 4 unintentional
#   "02" -> 2 homicide           "06"       -> 5 legal intervention
#   "09" -> 3 undetermined       "88"       -> 6 natural
# Any other MANNER_C value (including NA) leaves DthTypN as NA.
nvdrs[, DthTypN := c(1, 2, 3, 4, 4, 5, 6)[
  match(MANNER_C, c("01", "02", "09", "03", "22", "06", "88"))
]]
    
# Rename/reformat: DSTATE_C becomes DthSt and HEALTH becomes PhysProb
setnames(nvdrs, old = c('DSTATE_C', 'HEALTH'), new = c('DthSt', 'PhysProb'))

# sex: copy of SEX_C with code 9 turned into NA
nvdrs[, Sex := replace(SEX_C, which(SEX_C == 9), NA)]

# age: numeric version of AgeYrs_C with 999 turned into NA
nvdrs[, Age := as.numeric(AgeYrs_C)]
nvdrs[Age %in% 999, Age := NA]

# age groups, left-closed intervals:
#   0 = below 15, 1 = [15, 25), 2 = [25, 45), 3 = [45, 65), 4 = 65 and above
# (NA ages stay NA; as.numeric keeps the column a double)
nvdrs[, AgeGrp4 := as.numeric(findInterval(Age, c(15, 25, 45, 65)))]

# born in us: numeric BPLACE_C1 codes 1-54 give 1; codes 55, 56, 57, 61, 62
# and 88 give 0; everything else stays NA
nvdrs[, BornUSA := {
  birthplace <- as.numeric(BPLACE_C1)
  born_usa <- rep(NA_real_, length(birthplace))
  born_usa[birthplace %in% 1:54] <- 1
  born_usa[birthplace %in% c(55, 56, 57, 61, 62, 88)] <- 0
  born_usa
}]

# race: RACE_C1 codes 1-4 are carried over unchanged
#   1 = white, 2 = black, 3 = native, 4 = asian; other codes become NA
nvdrs[, Race4 := c(1, 2, 3, 4)[match(RACE_C1, 1:4)]]

# hispanic: ETHNCTY_C codes 0 and 1 are carried over; other codes become NA
nvdrs[, Hisp := c(0, 1)[match(ETHNCTY_C, c(0, 1))]]

# marital status: reorder the MS_C codes into MarStat6
#   MS_C 1 -> 1 married         MS_C 5 -> 4 separated
#   MS_C 3 -> 2 widowed         MS_C 2 -> 5 never married
#   MS_C 4 -> 3 divorced        MS_C 6 -> 6 other single
# The lookup vector is indexed by MS_C (position 1 = MS_C 1, and so on).
nvdrs[, MarStat6 := c(1, 5, 2, 3, 4, 6)[match(MS_C, 1:6)]]

# unemployment status; the free-text occupation fields are cleaned with refinr
# first: key_collision_merge() on the raw text, then n_gram_merge() on that
# result. Both intermediate columns are stored for each source (MP and PP).
nvdrs[, c("MP_occupation", "MP_occupation_ngram") := {
  merged <- key_collision_merge(MP_OCCUP)
  list(merged, n_gram_merge(merged))
}]

nvdrs[, c("PP_occupation", "PP_occupation_ngram") := {
  merged <- key_collision_merge(PP_OCCUP)
  list(merged, n_gram_merge(merged))
}]

# Keyword lists used to classify the cleaned occupation text. Each element is
# a regular-expression fragment; the fragments are later collapsed with "|"
# and matched case-insensitively.
#
# words_unemployed: text that indicates the person was not working.
# words_NA:         text that indicates the occupation is unknown/missing, so
#                   that missing can be told apart from unemployed.
#
# Whole-string entries must be anchored as ^...$ ("^" = start, "$" = end).
# Written the other way round ('$NA^') the alternative can never match, so a
# bare "NA" or "N/A" would not be recognised as missing.
words_unemployed <- c('Never Work', 'Never Emp', 'None', 'Not wor', 'Not In Workforce',
                      'Not Emp|Unemp|Un Employeed|Umeployed|Umemp')
words_NA <- c('Unk', '^Unknown$', 'Ukn', 'Not Available', 'Blank', 'Not Applicable', 'Missing',
              '^NA$', '^N/A$', 'Not Specified', '999')

# Classify cleaned occupation strings as unemployed / not unemployed / missing.
#
# Arguments
#   occupation       vector of occupation strings (may contain NA or "")
#   unemployed_words regex fragments indicating unemployment
#   missing_words    regex fragments indicating an unknown/missing occupation
#
# Returns a list of two integer vectors, each as long as `occupation`:
#   unemployed  1 = matches an unemployment keyword, 0 = does not,
#               NA = occupation is missing
#   missing     1 = occupation is missing/unknown, 0 = otherwise
classify_occupation <- function(occupation, unemployed_words, missing_words) {
  unemployed <- as.integer(grepl(paste0(unemployed_words, collapse = '|'),
                                 occupation, ignore.case = TRUE))

  # Missing = matches a missing keyword, is an empty string, or is NA.
  # NA has to be tested explicitly: grepl() returns FALSE for NA and
  # `NA == ''` is NA, so without is.na() an NA occupation would be coded
  # 0 (not unemployed) instead of missing.
  is_missing <- grepl(paste0(missing_words, collapse = '|'),
                      occupation, ignore.case = TRUE) |
    occupation %in% '' |
    is.na(occupation)

  # redefine employment: text that mentions employment, or one of the listed
  # job words, is informative even though it also hit a missing keyword
  # (e.g. 'Unk' also matches "Junk" and "Dunkin"), so it is not missing.
  # The job-word list is matched case-sensitively.
  mentions_job <- grepl("empl", occupation, ignore.case = TRUE) |
    grepl('DUNKIN DONUTS|dispatcher|officer|junk|laborer|Position|Job|Student|Retired|Retail|Operator|Laborer|Labor|Junk|Officer|Empkoyed|Dunkin|Dispatcher',
          occupation)
  is_missing <- is_missing & !mentions_job

  unemployed[is_missing] <- NA
  list(unemployed = unemployed, missing = as.integer(is_missing))
}

# Apply the same classification to both occupation sources (MP and PP)
nvdrs[, c("unemployed_MP", "unemployed_MP_NA") :=
        classify_occupation(MP_occupation_ngram, words_unemployed, words_NA)]
nvdrs[, c("unemployed_PP", "unemployed_PP_NA") :=
        classify_occupation(PP_occupation_ngram, words_unemployed, words_NA)]

# combine two different data sources: use the PP flag, fall back to the MP
# flag where PP is NA, and let an MP flag of 1 always win
nvdrs[, unemployed := ifelse(is.na(unemployed_PP), unemployed_MP, unemployed_PP)]
nvdrs[unemployed_MP %in% 1, unemployed := 1]

# final variable names: combined flag plus the two source-specific flags
nvdrs[, UnEmpl   := unemployed]
nvdrs[, UnEmpl_M := unemployed_MP]
nvdrs[, UnEmpl_P := unemployed_PP]

# physical problems: PhysProb takes the value of HEALTH_C, except that rows
# where both CIRCUM_C and HEALTH_C are 0 are set to NA
nvdrs[, PhysProb := replace(HEALTH_C, which(CIRCUM_C == 0 & HEALTH_C == 0), NA)]

# Restrict the data with a chain of row filters, applied in this order:
#   1. suicides only (DthTypN 1)
#   2. years 2005 through 2011
#   3. the listed numeric state codes
#   4. person type 1 or 3 -- exclude "non-victim (i.e. suspect?)"
#   5. drop county code "999" -- exclude unknown counties
nvdrs <- nvdrs[
  DthTypN %in% 1
][
  Year %in% 2005:2011
][
  as.numeric(State) %in% c(2, 8, 13, 21, 24, 25, 34, 35, 37, 40, 41, 44, 45, 49, 51, 55)
][
  PTYPE_C %in% c(1, 3)
][
  Cnty != "999"
]

# remove unnecessary variables: only the columns named in `keep` are retained
keep <- c("Year", "Site", "IncID", "PerID", "State", "Cnty", "DthTypN", "DthSt",
          "Sex", "Age", "AgeGrp4", "BornUSA", "Race4", "Hisp", "MarStat6", "UnEmpl", "PhysProb")
# with = FALSE makes data.table treat `keep` as a vector of column names.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
nvdrs.final <- nvdrs[, keep, with = FALSE]

# add death indicators: constant DSID = 1 on every row, added by reference
# (`:=`) rather than through `$<-`, which copies the data.table
nvdrs.final[, DSID := 1]

# save results to the processed-data folder at maximum compression
write_fst(nvdrs.final, file.path(processed_path, "nvdrs_v1.fst"), compress = 100)


